
__author__ = "YaoJiaWen";

import requests;
from bs4 import BeautifulSoup;
import re;
from entity.double_color_ball import *;
from datetime import datetime;

'''
双色球爬虫
http://kaijiang.zhcw.com/zhcw/html/ssq/list.html
http://www.sfac.xyz/economy/lottery
http://538b537e25.zicp.vip:13595/api/economy/bicolorSpheres
'''

# Request headers sent with every page fetch; the User-Agent mimics a desktop Firefox browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
}

# Fetch double color ball data from every paginated result page
def get_dcb_page_list_data(page_count=150):
    """Crawl the paginated result listing and return all parsed records.

    The first page is served from ``list.html``; pages 2 through
    ``page_count`` follow the ``list_<n>.html`` naming pattern.

    Args:
        page_count: Number of the last page to crawl (inclusive).

    Returns:
        One list containing whatever ``get_dcb_page_data`` returned for each
        page, concatenated in page order.
    """
    first_page_url = "http://kaijiang.zhcw.com/zhcw/html/ssq/list.html"
    numbered_page_urls = [
        "http://kaijiang.zhcw.com/zhcw/html/ssq/list_%d.html" % page_no
        for page_no in range(2, page_count + 1)
    ]

    # Crawl page by page, accumulating the records of each page
    all_records = []
    for page_url in [first_page_url] + numbered_page_urls:
        all_records.extend(get_dcb_page_data(page_url))
    return all_records

# Fetch double color ball data from a single result page

# Patterns applied to the HTML text of one <tr>; compiled once at import time
# instead of being looked up again for every row.
_DCB_CENTER_CELL_RE = re.compile(r'<td align="center">(.*?)</td>')
_DCB_SALES_RE = re.compile(r'<td><strong>(.*?)</strong></td>')
_DCB_RED_BALL_RE = re.compile(r'<em class="rr">(.*?)</em>')
_DCB_BLUE_BALL_RE = re.compile(r'<em>(.*?)</em>')
_DCB_FIRST_PRIZE_RE = re.compile(r'<td align="left" style="color:#999;"><strong>(.*?)</strong>')
_DCB_SECOND_PRIZE_RE = re.compile(r'<td align="center"><strong class="rc">(.*?)</strong></td>')


def get_dcb_page_data(url, timeout=30):
    """Download one result page and turn each data row into a record.

    Every ``<tr>`` of the page is rendered back to HTML text and matched
    against the module-level patterns. Rows with fewer than two centered
    cells (award date and issue number) are skipped. Each record built is
    passed to ``insert_(dcb, "issue_no")`` before being collected.
    NOTE(review): ``insert_`` and ``Double_Color_Ball`` presumably come from
    ``entity.double_color_ball`` and persist the record -- confirm there.

    Args:
        url: Address of the result page to fetch.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Without it a stalled connection would block
            the crawl indefinitely.

    Returns:
        List of the ``Double_Color_Ball`` objects built from the page. The
        list is empty when the response status is not 200.

    Raises:
        IndexError: If a data row lacks the sales, blue ball, first prize or
            second prize cell.
        Exceptions raised by ``requests.get`` (for example on timeout or
        connection failure) are not caught here.
    """
    print(url)

    # Records parsed from this single page
    page_data = []

    # Send the request; give up after `timeout` seconds instead of hanging
    response = requests.get(url=url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return page_data
    response.encoding = response.apparent_encoding

    # Parse the response body
    bs = BeautifulSoup(markup=response.text, features="html.parser")
    for tr in bs.find_all(name="tr"):
        row_html = str(tr)

        # Skip rows that are not draw records. Both index 0 (award date) and
        # index 1 (issue number) are read below, so two cells are required.
        center_cells = _DCB_CENTER_CELL_RE.findall(row_html)
        if len(center_cells) < 2:
            continue

        # Strip spaces and thousands separators from the sales figure
        sales = _DCB_SALES_RE.findall(row_html)[0].replace(" ", "").replace(",", "")
        dcb = Double_Color_Ball(
            id=0,
            issue_no=center_cells[1],
            award_date=center_cells[0],
            red_ball=" ".join(_DCB_RED_BALL_RE.findall(row_html)),
            blue_ball=_DCB_BLUE_BALL_RE.findall(row_html)[0],
            # An empty sales cell is stored as the integer 0
            total_sales=sales if len(sales) > 0 else 0,
            first_prize_number=_DCB_FIRST_PRIZE_RE.findall(row_html)[0],
            second_prize_number=_DCB_SECOND_PRIZE_RE.findall(row_html)[0],
            create_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        )
        insert_(dcb, "issue_no")
        page_data.append(dcb)

    return page_data

# Script entry point: crawl the first 150 result pages when run directly.
if __name__ == "__main__":
    get_dcb_page_list_data(page_count=150)

